Team 5: Amit and Ravit¶

In [1]:
from plotly.offline import  init_notebook_mode
init_notebook_mode()

Task 4:¶

In [4]:
import pandas as pd


df_GeneralData = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='GeneralData')

mean_weight = df_GeneralData['משקל'].mean()
df_GeneralData['משקל'].fillna(mean_weight, inplace=True)

mean_height = df_GeneralData['גובה'].mean()
df_GeneralData['גובה'].fillna(mean_weight, inplace=True)

# Calculate BMI Column
df_GeneralData.loc[df_GeneralData['BMI'].isna(), 'BMI'] = df_GeneralData.apply(
    lambda row: row['משקל'] / ((row['גובה'] /100 )**2) if pd.notnull(row['משקל']) and pd.notnull(row['גובה']) else None,
    axis=1
)

# Filling empty values in the column in the value "לא ידוע"
df_GeneralData['השכלה'] = df_GeneralData['השכלה'].apply(lambda x: None if isinstance(x, (int, float)) else x)
df_GeneralData['השכלה'].fillna('לא ידוע', inplace=True)


# Convert 'מספר ילדים' to numeric, setting non-numeric values to NaN
df_GeneralData['מספר ילדים'] = pd.to_numeric(df_GeneralData['מספר ילדים'], errors='coerce')
# Calculate the median of the numeric values, ignoring NaN values
median_value = df_GeneralData['מספר ילדים'].median()
# Replace non-numeric (NaN) values with the median
df_GeneralData['מספר ילדים'].fillna(median_value, inplace=True)

marital_status_mapping = {
    'לא ידוע': 1,
    'רווק': 2,
    'נשוי': 3,
    'פרוד': 4,
    'גרוש': 5,
    'אלמן': 6
}

# Apply the mapping to the 'מצב משפחתי' column
df_GeneralData['מצב משפחתי'] = df_GeneralData['מצב משפחתי'].map(marital_status_mapping)



# Save the updated DataFrame to a new Excel file
output_file_path = 'src\data\GeneralData.csv'
df_GeneralData.to_csv(output_file_path, index = False, encoding='utf-8-sig')

Task 14: erBeforeHospitalization2 EDA:¶

In [6]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Function to create and customize a histogram plot 
def create_plot(df: pd.DataFrame, x_col: str, title: str):
    fig1 = px.histogram(df, x=x_col, barmode="group"). \
        update_layout(
            yaxis_title="Count",         # Set the y-axis title
            title=title,                 # Set the plot title
            title_font_size=30           # Set the font size for the title
        )
    fig1.update_xaxes(tickfont_size=20, title=" ")  
    fig1.update_yaxes(title_font={"size": 20})      
    fig1.update_traces(opacity=0.9)                 
    fig1.show()                                     

# Load data 
data = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='erBeforeHospitalization2')

# Task 14:
# Print the data types of the columns in the dataset
print('Data Types: \n', data.dtypes)

# Print the number of unique patients in the dataset
print('Unique number of patients: ', len(data['Patient'].unique()))

# Calculate the admission time in days for the first hospitalization event
data['admission_time_1'] = (data['Release_Date'] - data['Admission_Entry_Date']).dt.days

# Histograms
create_plot(data, 'admission_time_1', 'Histogram of Admission Time (days)')
create_plot(data, 'מחלקה מאשפזת1', 'Patients by Admission Department')

# Calculate the ER time in hours for the second hospitalization event
data['admission_time_2'] = (data['ev_Release_Time'] - data['ev_Admission_Date']).dt.total_seconds() / 3600

# Histograms
create_plot(data, 'admission_time_2', 'Histogram of ER Time (hours)')
create_plot(data, 'דרך הגעה למיון', 'ER Arrival Histogram')
create_plot(data, 'מיון', 'ER Type Histogram')
Data Types: 
 Patient                               int64
Admission_Medical_Record              int64
Admission_Entry_Date         datetime64[ns]
Release_Date                 datetime64[ns]
מחלקה מאשפזת1                         int64
Medical_Record                      float64
ev_Admission_Date            datetime64[ns]
ev_Release_Time              datetime64[ns]
דרך הגעה למיון                       object
מיון                                 object
urgencyLevelTime                    float64
אבחנות במיון                         object
codeDoctor                          float64
Admission_Medical_Record2             int64
Admission_Entry_Date2        datetime64[ns]
Release_Date2                datetime64[ns]
מחלקה מאשפזת2                        object
dtype: object
Unique number of patients:  4515

Task 16: optimal distribution:¶

In [8]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Function to create and customize a histogram plot
def create_plot(df: pd.DataFrame, x_col: str, title: str):
    fig1 = px.histogram(df, x=x_col, barmode="group"). \
        update_layout(
            yaxis_title="Count",         # Set the y-axis title
            title=title,                 # Set the plot title
            title_font_size=30           # Set the font size for the title
        )
    fig1.update_xaxes(tickfont_size=20, title=" ")  
    fig1.update_yaxes(title_font={"size": 20})      
    fig1.update_traces(opacity=0.9)                 
    fig1.show()                                     

# Function to convert a continuous variable into categories based on percentiles
def create_categories_by_precentile(data: pd.DataFrame, groups_num: int):
    data['range_days_between_admissions'] = pd.qcut(data['days_between_admissions'], q=groups_num).astype('str')  # Create percentile-based bins for the 'days_between_admissions' column
    data['range_days_between_admissions'] = data['range_days_between_admissions'].str.slice(start=1, stop=-1)
    bins_map = {value: idx for idx, value in enumerate(data['range_days_between_admissions'].unique())}
    data['category_days_between_admissions'] = data['range_days_between_admissions'].map(bins_map) # Map the ranges to their corresponding category index
    return data

# Load data 
data = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='erBeforeHospitalization2')

# Task 16:
# Calculate the number of days between the first and second admissions
data['days_between_admissions'] = (data['Admission_Entry_Date2'] - data['Release_Date']).dt.days

# Histogram 
create_plot(data, 'days_between_admissions', 'Histogram of Days Passed Between Admissions')
print("We can see that the days between admissions are from 0 to 29, most of them are 0-7 days.")
# Convert the continuous 'days_between_admissions' into categories based on percentiles
data = create_categories_by_precentile(data, 5)
# Histogram
create_plot(data, 'category_days_between_admissions', 'Histogram of Days Passed Between Admissions by Category')
print("We can see the new distribution for 5 categories by precentile.")
print("The categories are:")
print('Categories created by days:', data['range_days_between_admissions'].unique())
We can see that the days between admissions are from 0 to 29, most of them are 0-7 days.
We can see the new distribution for 5 categories by precentile.
The categories are:
Categories created by days: ['8.0, 14.0' '21.0, 29.0' '-0.001, 4.0' '4.0, 8.0' '14.0, 21.0']

Task 28: Find Connections:¶

In [9]:
from scipy.stats import chi2_contingency


# Visualize relationships between categorical variables and re-hospitalization
def create_plot_relationship(df, x_col, title):
    fig = px.histogram(df, x=x_col, barmode='group'). \
        update_layout(yaxis_title="Number of Recordings",
                      title=title,
                      title_font_size=30)
    fig.update_xaxes(tickfont_size=20, title=" ")
    fig.update_yaxes(title_font={"size": 20})
    fig.update_traces(opacity=0.9)
    return fig

# Relationship between Education and Rehospitalization
create_plot_relationship(df_GeneralData, 'השכלה', 'Rehospitalization by Education Level')

# Relationship between Number of Children and Rehospitalization
create_plot_relationship(df_GeneralData, 'מספר ילדים', 'Rehospitalization by Number of Children')

# Relationship between Marital Status and Rehospitalization
create_plot_relationship(df_GeneralData, 'מצב משפחתי', 'Rehospitalization by Marital Status')

# Chi-square tests for relationships
def chi_square_test(df, col):
    contingency_table = pd.crosstab(df[col],data['days_between_admissions'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-square test for {col}: Chi2 = {chi2}, p-value = {p}")

# Apply Chi-square tests
chi_square_test(df_GeneralData, 'השכלה')
chi_square_test(df_GeneralData, 'מספר ילדים')
chi_square_test(df_GeneralData, 'מצב משפחתי')
Chi-square test for השכלה: Chi2 = 18719.003793880023, p-value = 0.3535239781225367
Chi-square test for מספר ילדים: Chi2 = 399.64626372514425, p-value = 0.8868501584051448
Chi-square test for מצב משפחתי: Chi2 = 123.44004841993777, p-value = 0.30085187806882535

Task 32: Time Series Analysis:¶

In [13]:
import statsmodels.api as sm
import matplotlib.pyplot as plt


# Load data 
df_hospitalization1 = pd.read_excel(r'src\data\rehospitalization.xlsx', sheet_name='hospitalization1')

# Convert to datetime if not already
df_hospitalization1['Admission_Entry_Date'] = pd.to_datetime(df_hospitalization1['Admission_Entry_Date'])
df_hospitalization1['Release_Date'] = pd.to_datetime(df_hospitalization1['Release_Date'])

# Filter and group data for each department (1 to 5)
departments = [1, 2, 3, 4, 5]

# Function to create time series plots for admissions and releases
def create_time_series_plot(df, date_col, group_col, title):
    monthly_counts = df.groupby([pd.Grouper(key=date_col, freq='M'), group_col]).size().reset_index(name='Count')
    
    # Create time series plot
    fig = px.line(monthly_counts, x=date_col, y='Count', color=group_col,
                  title=title,
                  labels={date_col: 'Date', 'Count': 'Number of Admissions/Releases'})
    fig.update_layout(title_font_size=30)
    fig.show()

# Function to perform seasonal decomposition for a given department
def seasonal_decomposition(df, department_number):
    # Filter data for the department
    department_data = df[df['unitName1'] == department_number]
    
    # Admissions time series
    department_admissions = department_data.set_index('Admission_Entry_Date').resample('M').size()
    
    # Decomposition
    decomposition = sm.tsa.seasonal_decompose(department_admissions, model='additive')
    fig = decomposition.plot()
    fig.set_size_inches(14, 7)
    plt.suptitle(f'Seasonal Decomposition of Admissions for Department {department_number}', fontsize=16)
    plt.show()

# Loop through each department (1 to 5) for time series plots and decomposition
for department in departments:
    # Filter data for the department
    department_data = df_hospitalization1[df_hospitalization1['unitName1'] == department]
    
    # Time series plots for admissions and releases
    create_time_series_plot(department_data, 'Admission_Entry_Date', 'unitName1', f'Admissions Over Time for Department {department}')
    create_time_series_plot(department_data, 'Release_Date', 'unitName1', f'Releases Over Time for Department {department}')
    
    # Seasonal decomposition for admissions
    seasonal_decomposition(df_hospitalization1, department)

Conclusion Summary:¶

Admissions: The trend in admissions reflects both seasonal fluctuations and potential healthcare trends over time. Strong seasonality suggests that certain departments, particularly those handling acute conditions, are highly influenced by external factors like flu seasons or public health crises.

Releases: Releases generally follow the admissions pattern with some lag, but discrepancies between the two can point to inefficiencies in the hospital system or specific challenges in managing patient flow.

Irregularities: Large residuals in either admissions or releases indicate that there are external factors or one-off events (such as pandemics) that heavily influenced the normal patterns.

In conclusion, understanding the dynamic between admissions and releases is crucial for hospital management, as it enables better planning for resource allocation, staffing, and patient flow optimization, particularly in departments facing seasonal variations or external pressures.

In [ ]: